# R script for the Homework 3 of 19-704 course
# The goal is to replicate and extend the results from Anglin and Gencay (1996)

# Set working directory
# setwd("/Users/Alexandra/Documents/PhD CMU/Courses/S15 _ 19-704 - Data Analysis/Homeworks/HW3/")
 
# Load the relevant libraries and data sets
library(AER)
data(HousePrices)

# Taking a quick look at the data
summary(HousePrices)
head(HousePrices)

# Part One: Replication  --------------------------------------------

# Question 1 ----------------------------------------------

# Create dummy variables
hprice <- data.frame(HousePrices)
summary(hprice)
head(hprice)
hprice <- transform(HousePrices, DRV = as.numeric(driveway) - 1)
hprice <- transform(hprice, REC = as.numeric(recreation) - 1)
hprice <- transform(hprice, FFIN = as.numeric(fullbase) - 1)
hprice <- transform(hprice, GHW = as.numeric(gasheat) - 1)
hprice <- transform(hprice, CA = as.numeric(aircon) - 1)
hprice <- transform(hprice, GAR = garage)
hprice <- transform(hprice, REG = as.numeric(prefer) - 1)
hprice <- transform(hprice, LOT = lotsize)
hprice <- transform(hprice, BDMS = bedrooms)
hprice <- transform(hprice, FB = bathrooms)
hprice <- transform(hprice, STY = stories)
head(hprice)
summary(hprice)

# Regression for Table II
hp.reg1 <- lm(log(price) ~ DRV + REC + FFIN + GHW + CA + GAR
              + REG + log(LOT) + log(BDMS) + log(FB)
              + log(STY), data = hprice)
summary(hp.reg1)

# Part Two: Extension ----------------------------------------------------------------

# Question 1 --------------------------------------------------------

# Create histogram for house prices
png(filename = "HW3_Cosseron_histogram1_Q1.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3,
    mar = c(5, 4, 1, 1),
    oma = (0, 0, 0, 0))
hist(hprice$price,
     prob = FALSE,
     breaks = "fd",
     xlim = c(20000, 200000),
     ylim = c(0, 120),
     xlab = "House prices ($)",
     main = "Histogram of House Prices")
rug(hprice$price, col = rgb(1, 0, 0, .3))
dev.off()

# Create histogram for log house prices
lprice <- log(hprice$price)
png(filename = "HW3_Cosseron_histogram2_Q1.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3,
    mar = c(5, 4, 1, 1))
hist(lprice,
     prob = FALSE,
     breaks = "FD",
     xlim = c(10, 12.5),
     ylim = c(0, 120),
     xlab = "Log House prices",
     main = "Histogram of Log House Prices")
rug(lprice, col = rgb(1, 0, 0, .3))
dev.off()

# Create histogram for lot size
png(filename = "HW3_Cosseron_histogram3_Q1.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3,
    mar = c(5, 4, 1, 1))
hist(hprice$lotsize,
     prob = FALSE,
     breaks = "FD",
     xlim = c(1500, 20000),
     ylim = c(0, 120),
     xlab = "Lot Size (in square feet)",
     main = "Histogram of Lot Size")
rug(hprice$lotsize, col = rgb(1, 0, 0, .3))
dev.off()

# Create histogram for the number of bedrooms
png(filename = "HW3_Cosseron_histogram4_Q1.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3,
    mar = c(5, 4, 1, 1))
hist(hprice$BDMS,
     prob = FALSE,
     breaks = "FD",
     xlim = c(1, 6),
     ylim = c(0, 350),
     xlab = "Number of Bedrooms",
     main = "Histogram of the Number of Bedrooms")
rug(hprice$BDMS, col = rgb(1, 0, 0, .3))
dev.off()

# Create histogram for the number of full bathrooms
png(filename = "HW3_Cosseron_histogram5_Q1.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3,
    mar = c(5, 4, 1, 1))
hist(hprice$FB,
     prob = FALSE,
     breaks = "FD",
     xlim = c(1, 4),
     ylim = c(0, 500),
     xlab = "Number of Full Bathrooms",
     main = "Histogram of the Number of Full Bathrooms")
rug(hprice$FB, col = rgb(1, 0, 0, .3))
dev.off()

# Create histogram for the number of stories
png(filename = "HW3_Cosseron_histogram6_Q1.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3,
    mar = c(5, 4, 1, 1))
hist(hprice$STY,
     prob = FALSE,
     breaks = "FD",
     xlim = c(1, 4),
     ylim = c(0, 300),
     xlab = "Number of Stories",
     main = "Histogram of the Number of Stories")
rug(hprice$STY, col = rgb(1, 0, 0, .3))
dev.off()

# Summaries
summary(hprice$price)
summary(lprice)
summary(hprice$lotsize)
summary(hprice$BDMS)
summary(hprice$FB)
summary(hprice$STY)

# Question 2 ---------------------------------------------------------------------------------

# Create tables of the driveway, recreational room, full and furnished basement,
# gas for hot water heating, central air conditioning, ans preferred neighborhood
# dummy variables
table(hprice$DRV)
table(hprice$REC)
table(hprice$FFIN)
table(hprice$GHW)
table(hprice$CA)
table(hprice$REG)

# QUestion 4 ---------------------------------------------------------------------------------

# Plot the jackknife residuals versus fitted values for the benchmark regression
hp.reg2 <- lm(price ~ DRV + REC + FFIN + GHW + CA + GAR
              + REG + log(LOT) + BDMS + FB
              + STY, data = hprice)
summary(hp.reg2)

png(filename = "HW3_Cosseron_Q4_regJackknife.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3)
plot(jitter(fitted(hp.reg2)),
     jitter(rstudent(hp.reg2)),
     xlab = "Fitted Values",
     ylab = "Jackknife Residuals",
     pch = 19,
     col = rgb(0, 0, 0, .3),
     ylim = c(-5, 6),
     xlim = c(20000, 150000))
abline(h = 0)
dev.off()

# Box-Cox transformation 
library(car)
png(filename = "HW3_Cosseron_Q4_boxCox.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3)
hp.bc <- boxCox(price ~ DRV + REC + FFIN + GHW + CA + GAR
                + REG + log(LOT) + BDMS + FB
                + STY, 
                data = hprice)
dev.off()

max(hp.bc$y)
hp.bc$x[53]

# Question 5 -----------------------------------------------------------------------

# Plot the component plus residual
hp.reg3 <- lm(log(price) ~ DRV + REC + FFIN + GHW + CA + GAR
              + REG + LOT + BDMS + FB
              + STY, data = hprice)
summary(hp.reg3)

x <- hprice$lotsize
y <- lprice

png(filename = "HW3_Cosseron_complusresid.png",
    width = 3000,
    height = 3000,
    res = 300)
par(mfrow = c(2, 2),
    cex = 1.3,
    mar = c(4, 4, 1, 1))
plot(x, y,
     pch = 19,
     col = rgb(0, 0, 0, .5),
     ylab = "Log house prices",
     xlab = "Lot size")
# Plot the residual plus component against x
plot(x, resid(hp.reg3) + coef(hp.reg3)[9]*x,
     pch = 19,
     col = rgb(0, 0, 0, .5),
     ylab = "Component + Residual",
     xlab = "Lot size")
# Plot the model residual versus x
plot(x, resid(hp.reg3),
     pch = 19,
     col = rgb(0, 0, 0, .5),
     ylab = "Residual",
     xlab = "Lot size")
# Plot the component versus x
plot(x, coef(hp.reg3)[9]*x,
     pch = 19,
     col = rgb (0, 0, 0, .5),
     ylab = "Component",
     xlab = "Lot size")
dev.off()

# Box-Tidwell Power Transformation
library(car)
boxTidwell(lprice ~ hprice$lotsize)

# Question 6 --------------------------------------------------------

# Create a semi-parametric Generalized Additive Model (GAM)
install.packages("mgcv", repos = "http://lib.stat.cmu.edu/R/CRAN/")
library(mgcv)

x <- hprice$lotsize
y <- lprice

gam1 <- gam(y ~ s(x))

png(filename = "hw3_Cosseron_gam1.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3)
plot(x, y, 
     pch = 19, 
     col = rgb(0, 0, 0, .3),
     ylab = "Log house prices",
     xlab = "Lot size")
lines(sort(x),
      predict(gam1,
              newdata = data.frame(x = sort(x))),
      col = "green",
      lwd = 2)
dev.off()

# Plot partial residuals for lot size from GAM
 png(filename = "HW3_Cosseron_partialresid.png",
     width = 3000,
     height = 3000,
     res = 300)
par(cex = 1.3)
plot(gam1,
     residuals = TRUE,
     shade     = TRUE,
     pages = 1,
     scale = 0,
     cex = 3)
dev.off()

# Question 7 --------------------------------------------------------------------------------------

# Regression for the new model - using power transformation of lot size
hprice <- transform(hprice, tLOT = LOT^(-0.176))
hp.reg4 <- lm(log(price) ~ DRV + REC + FFIN + GHW + CA + GAR
              + REG + tLOT + BDMS + FB
              + STY, data = hprice)
summary(hp.reg4)

z <- hprice$tLOT

gam2 <- gam(y ~ s(z))

png(filename = "HW3_Cosseron_gam2.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3)
plot(gam2,
     residuals = TRUE,
     shade     = TRUE,
     pages     = 1,
     scales    = 0,
     cex       = 3,
     xlab = "Transformed Lot size variable")
dev.off()

# Compare models using 5-fold cross-validation
hprice <- transform(hprice, lprice = log(price))
nfolds <- 5
case.folds <- rep(1:nfolds, length.out = nrow(hprice))
set.seed(4)
case.folds <- sample(case.folds)

mymodel <- c()
lmodel <- c()

for (fold in 1:nfolds){
  train <- hprice[case.folds != fold, ]
  test <- hprice[case.folds == fold, ]
  lmodel.train <- lm(lprice ~ log(LOT), data = hprice)
  mymodel.train <- lm(lprice ~ tLOT, data = hprice)
  lmodel.test <- (test$lprice - predict(lmodel.train, test))^2
  mymodel.test <- (test$lprice - predict(mymodel.train, test))^2
  rMSEtest1 <- sqrt(sum(lmodel.test)/length(lmodel.test))
  rMSEtest2 <- sqrt(sum(mymodel.test)/length(mymodel.test))
  lmodel <- append(spmodel, rMSEtest1)
  mymodel <- append(mymodel, rMSEtest2)
}
lmodel.avg <- mean(spmodel)
mymodel.avg <- mean(mymodel)

# Question 8 -----------------------------------------------------------
hp.reg5 <- lm(log(price) ~ DRV + REC + FFIN + GHW + CA + GAR
              + REG + log(LOT) + BDMS + FB
              + STY, data = hprice)
summary(hp.reg5)

# Get the squared residual
hp.reg5.resid.sq <- resid(hp.reg5)^2
x <- hprice$DRV + hprice$REC + hprice$FFIN + hprice$GHW + hprice$CA 
+ hprice$GAR + hprice$REG + log(hprice$LOT) + hprice$BDMS + hprice$FB
+ hprice$STY
x.sq.dist <- (x - mean(x))^2

# Calculate the robust standard error
hp.reg5.robust <- sqrt((sum(hp.reg5.resid.sq*x.sq.dist))/sum(x.sq.dist)^2)

# Calculate the classical standard error
mean(summary(hp.reg5)$coefficients[, 2])
